knitr::opts_chunk$set(echo = TRUE)
if (!require("pacman")) install.packages("pacman")
pacman::p_load(dplyr, ggplot2, stringr, tidyr, data.table, DT, ggjoy, ggmap, plotly, leaflet)
In this notebook, I demonstrate some exploratory work to analyse flight delays at take-off. We can script the public data from the US DoT website (at the following url: http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ ID=236&DB_Short_Name=OnTime) to answer questions such as which airport has the most delays, which one is the most on-time airline etc.
# read in data
flight <- fread("376941973_T_ONTIME.csv", stringsAsFactors = F)
airport_lookup <- fread("AIRPORT_ID.csv", stringsAsFactors = F)
airport_lookup <- airport_lookup[, Code:=as.numeric(Code)] # convert to numeric
airline_lookup <- fread("AIRLINE_Lookup.csv", stringsAsFactors = F)
airline_lookup <- airline_lookup[, Code:=as.numeric(Code)] # convert to numeric
A flight delay is defined as the following:
flight departed at the gate 15 minutes or more after the scheduled departure time
flight arrived at the gate 15 minutes or more after the scheduled arrival time
# get the departure data for each airport
depart.df <- flight %>%
# filter out the NAs in DEP_DELAY_NEW variable
filter(!is.na(DEP_DEL15)) %>%
group_by(ORIGIN_AIRPORT_ID) %>%
summarise(
# calculate the total number of departure airlines
NumOfDEP = n(),
# calculate the total number of departure delays
NumOfDelayDEP = sum(DEP_DEL15),
MedianDelayDEP = median(DEP_DELAY),
AvgDelayDEP = round(mean(DEP_DELAY),2),
OnTimeRateDEP = round(100 - 100*mean(DEP_DEL15),1)
) %>%
arrange(-NumOfDelayDEP) %>% ungroup() %>%
filter(NumOfDEP >=5) %>%
# left join the airport lookup table to get the actual name
left_join(airport_lookup, by = c("ORIGIN_AIRPORT_ID" = "Code")) %>%
# separate the Description to two columns
separate( Description, c("City", "AirportName"), ":")
# get the arrival data for each airport
arrival.df <- flight %>%
# filter out the NAs in DEP_DELAY_NEW variable
filter(!is.na(ARR_DEL15)) %>%
group_by(ORIGIN_AIRPORT_ID) %>%
summarise(
# calculate the total number of arrival airlines for each airport
NumOfARR = n(),
# calculate the total number of arrival delays for each airport
NumOfDelayARR = sum(ARR_DEL15),
MedianDelayARR = median(ARR_DELAY),
AvgDelayARR = round(mean(ARR_DELAY),2),
OnTimeRateARR = round(100 - 100*mean(ARR_DEL15),1)
) %>%
arrange(-NumOfDelayARR) %>% ungroup() %>%
filter(NumOfARR >=5)
# left join airport lookup table to get the actual name
delay.df <- left_join(depart.df, arrival.df, by = "ORIGIN_AIRPORT_ID") %>%
mutate( NumOfAirlines = NumOfDEP + NumOfARR,
NumOfDelay = NumOfDelayDEP + NumOfDelayARR,
# calculate the percentage of airlines for each airport
PerOfAirlines = round(100*NumOfAirlines / sum(NumOfAirlines),2),
# calculate the percentage of delays for each airport
PerOfDelay = round(100*NumOfDelay / sum(NumOfDelay),2)
)
# calculate the 90th percentile of all eligible airports
percentile <- (round(nrow(delay.df) * 0.1,0))
exceed90th <- delay.df %>% arrange(-NumOfDelay) %>% head(.,percentile) %>%
select(AirportName, City, NumOfAirlines, NumOfDelay)
datatable(exceed90th, class = 'cell-border stripe',
caption = 'Table 1: A list of airlines most at fault in the origin airports that exceed the 90th percentile of overall delays in January, 2016')
delayPerTable <- delay.df %>% arrange(-PerOfDelay) %>%
select(AirportName,City,PerOfDelay,NumOfDelay,NumOfAirlines, ORIGIN_AIRPORT_ID)
datatable(delayPerTable, class = 'cell-border stripe',
caption = 'Table 2: A list of the percentage that those airlines most at fault contribute to the overall delays for each origin airport in January, 2016')
For example, we would like to investigate is there any interesting pattern in John F. Kennedy International Airport in New York? Below is a plot of distribution of number of minutes delayed at JFK in January, which shows the variability of minutes delayed for each day. It seems that if you were unlucky to travel on Jan 24th in New York, you would experience an unusual delay for departure.
# top10Delay <- delay.df %>% arrange(-PerOfDelay) %>%
# head(.,10) %>% select(ORIGIN_AIRPORT_ID) %>% as.data.frame() %>% t() %>% as.vector()
# top10.df <- flight %>% filter(ORIGIN_AIRPORT_ID %in% top10Delay)
ggplot(flight %>% filter(DEP_DELAY < 50,
DEP_DELAY > -30,
# only keep JFK
ORIGIN_AIRPORT_ID == 12478),
aes(x = DEP_DELAY, y = factor(FL_DATE))) + geom_joy() +
ylab("Date") + xlab("Number of minutes delayed for departure at JFK airport") +
ggtitle("Distribution of number of minutes delayed at JFK in January")
## Picking joint bandwidth of 2.31
A plot of Percentage of on-time departure vs Percentage of on-time arrival is also shown below. A bigger (busier) airport is expected to have more delays. It’s worth to notice that, for example, Denver International Airport and Chicago O’Hare International Airport have a comparable size of about 38000 flights in January. However, the on-time rate of Denver (85%) is 7% higher than Chicago (77%).
# library(plotly)
plot_ly(
delay.df, x = ~ OnTimeRateDEP, y = ~ OnTimeRateARR,
# Hover text:
text = ~paste(AirportName, ":", NumOfDelay, "delays out of ",NumOfAirlines, "flights in Jan"),
color = ~ NumOfDelay, size = ~ NumOfDelay
) %>% layout(xaxis = list(title = "Percentage of on-time departure (%)"),
yaxis = list( title = "Percentage of on-time arrival (%)")
)
# run this instead to read in geocoded data
geo <- read.csv("geocoded.csv",stringsAsFactors = F)
delay.df$lat <- geo$lat
delay.df$lon <- geo$lon
# not run
# library(ggmap)
# delay.df$address <- paste(delay.df$AirportName,delay.df$City, sep = ",")
# for(i in 1:nrow(delay.df))
# {
# result <- geocode(delay.df$address[i], output = "latlona", source = "google")
# delay.df$lon[i] <- as.numeric(result[1])
# delay.df$lat[i] <- as.numeric(result[2])
# delay.df$geoAddress[i] <- as.character(result[3])
# }
# one manual fix error
# delay.df$lon[273] <- 13.484728
# delay.df$lat[273] <- 144.7993212
# write.csv(delay.df, "geocoded.csv", row.names=FALSE)
Once I have collected the geo-data, I further map the delays of each origin airport to a geographical map where points are colour coded according to the magnitude of overall delays for that origin airport.
# maps numeric input data to a fixed number of output colors using quantiles
# it seems look better than the continuous scale due to the huge difference between
# the max and min value in the data
qpal <- colorQuantile("RdYlBu", delay.df$NumOfDelay, n = 5, reverse=T)
# built an interactive map using leaflet
leaflet(data = delay.df) %>%
# Base groups
addTiles(group = "OSM (default)") %>%
addProviderTiles(providers$Stamen.TonerLite, group = "Toner Lite") %>%
addCircles(
lng = ~lon, lat = ~lat,
# represent NumOfDelay using color and radius
radius = ~NumOfDelay*15,
color = ~qpal(NumOfDelay),
# implement the click function
popup = ~as.character(paste("On-Time % of DEP and ARR:",OnTimeRateDEP," & ",OnTimeRateARR)),
# implement the hover function
label=~as.character(paste(AirportName,"'s delay: ", NumOfDelay, " out of ",NumOfAirlines," flights",sep="")),
labelOptions = labelOptions(noHide = F,direction = 'auto')
) %>%
addLegend("bottomright",pal = qpal, values = ~NumOfDelay,
title = "Quantile of overall delays",
# labFormat = labelFormat(prefix = ""),
opacity = 1
) %>%
# Layers control
addLayersControl(
baseGroups = c("OSM (default)", "Toner Lite"),
options = layersControlOptions(collapsed = FALSE)
) %>%
setView(lng = -96.0589, lat = 32.3601, zoom = 4)